/* Copyright 2012 Tobias Marschall
 * 
 * This file is part of CLEVER.
 * 
 * CLEVER is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CLEVER is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CLEVER.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <iostream>
#include <fstream>
#include <fstream>
#include <vector>
#include <cassert>

#include <boost/program_options.hpp>
#include <bamtools/api/BamReader.h>

#include "Histogram.h"
#include "HistogramBasedDistribution.h"
#include "GroupWiseBamReader.h"
#include "SortedBamReader.h"
#include "VersionInfo.h"
#include "ReadGroups.h"

using namespace std;
namespace po = boost::program_options;

void usage(const char* name, const po::options_description& options_desc) {
	cerr << "Usage: " << name << " [options]" << endl;
	cerr << endl;
	cerr << "Reads BAM format from stdin and outputs histogram of insert lengths" << endl;
	cerr << "of UNIQUELY MAPPED reads. Here, insert length refers to the area between" << endl;
	cerr << "two alignments (excluding the alignments)." << endl;
	cerr << endl;
	cerr << "IMPORTANT: Assumes that, in the BAM input, alignments belonging to the same read (pair)" << endl;
	cerr << "are grouped, i.e. are given in subsequent lines. If BAM file is sorted by position," << endl;
	cerr << "please use option --sorted." << endl;
	cerr << endl;
	cerr << options_desc << endl;
	exit(1);
}

bool is_clipped(const BamTools::BamAlignment& alignment) {
	vector<BamTools::CigarOp>::const_iterator it = alignment.CigarData.begin();
	for (;it!=alignment.CigarData.end(); ++it) {
		switch (it->Type) {
		case 'S':
		case 'H':
		case 'P':
			return true;
		default:
			continue;
		}
	}
	return false;
}

void process_read(const vector<BamTools::BamAlignment*>& alignments1, const vector<BamTools::BamAlignment*>& alignments2, Histogram* histogram) {
	if ((alignments1.size()==1) && (alignments2.size()==1) && (alignments1[0]->IsReverseStrand()!=alignments2[0]->IsReverseStrand()) && (alignments1[0]->RefID==alignments2[0]->RefID)) {
		int insert_size = 0;
		if (alignments1[0]->Position <= alignments2[0]->Position) {
			insert_size = alignments2[0]->Position - alignments1[0]->GetEndPosition();
		} else {
			insert_size = alignments1[0]->Position - alignments2[0]->GetEndPosition();
		}
		histogram->add(insert_size);
	}
}

bool is_unique(const BamTools::BamAlignment& alignment) {
	if (alignment.MapQuality == 0) return false;
	if (!alignment.IsPrimaryAlignment()) return false;
	uint32_t x0_tag = -1;
	uint32_t x1_tag = -1;
	if (alignment.GetTag("X0",x0_tag)) {
		if (x0_tag>1) return false;
	}
	if (alignment.GetTag("X1",x1_tag)) {
		if (x1_tag>0) return false;
	}
	return true;
}


bool output_distribution(Histogram* histogram, ostream& message_out, ostream& histogram_out, ostream* mean_and_sd_out, int min_counts_per_bin, bool gaussian_values) {
	if (histogram->size() < 100) {
		message_out << "Too few unique read pairs (" << histogram->size() << "). Skipping (not outputing distribution)." << endl;
		return false;
	}
	message_out << "Using " << histogram->size() << " unique read pairs." << endl;
	double mean;
	double sd;
	histogram->computeMeanAndStddev(&mean, &sd);
	message_out << "Main peak of internal segment length distribution: mean " << mean << ", sd " << sd << endl;
	if (mean_and_sd_out != 0) {
		(*mean_and_sd_out) << mean << " " << sd << endl;
	}
	auto_ptr<HistogramBasedDistribution> ild = histogram->toDistribution(min_counts_per_bin);
	if (gaussian_values) {
		ild->printWithGaussian(histogram_out, mean, sd);
	} else {
		histogram_out << *ild;
	}
	return true;
}

string open_wildcard(const string& filename, string wildcard, string value) {
	string result(filename);
	size_t pos = 0;
	while (true) {
		pos = result.find(wildcard, pos);
		if (pos == string::npos) break;
		result.replace(pos, wildcard.size(), value);
		pos += value.length();
	}
	return result;
}

int main(int argc, char* argv[]) {
	VersionInfo::checkAndPrintVersion("insert-length-histogram", cerr);
	string commandline = VersionInfo::commandline(argc, argv);

	// PARAMETERS
	int min_counts_per_bin = 20;
	string insert_length_histogram = "";
	string mean_sd_filename = "";
	bool gaussian_values = false;
	bool sorted_bam = false;
	bool all = false;
	long long count;
	string output_filename = "";
	bool readgroupwise = false;
	string rglist_filename = "";

	po::options_description options_desc("Allowed options");
	options_desc.add_options()
		("min_count_per_bin,c", po::value<int>(&min_counts_per_bin)->default_value(20), "Minimum number of counts per bin. If necessary, bins are joined to reach this number of counts.")
		("mean_and_sd,m", po::value<string>(&mean_sd_filename), "Write (robustly estimated) mean and standard deviation of main peak to given filename. If used together with option -R, the filename must contain the wildcard '{readgroup}' which is replaced by the actual read group.")
		("gaussian_values,g", po::value<bool>(&gaussian_values)->zero_tokens(), "Print forth column: probability based on Gaussian estimated from main peak.")
		("sorted", po::value<bool>(&sorted_bam)->zero_tokens(), "Input BAM file is sorted by position")
		("count,C", po::value<long long>(&count)->default_value(1000000), "Number of uniquely mappable read pairs to process.")
		("all", po::value<bool>(&all)->zero_tokens(), "Read all alignments rather than only the number given in option -C")
		("output,o", po::value<string>(&output_filename)->default_value(""), "Filename for output to be written to (default: stdout). If used together with option -R, the filename must contain the wildcard '{readgroup}' which is replaced by the actual read group.")
		("readgroupwise,R", po::value<bool>(&readgroupwise)->zero_tokens(), "Report a separate distributions for each read group. Must be used in connection with option -o.")
		("readgrouplist,L", po::value<string>(&rglist_filename)->default_value(""), "Write list of encountered read groups to given filename. Format: <readgroup> <distribution-filename>.")
	;
	
	if (isatty(fileno(stdin))) {
		usage(argv[0], options_desc);
	}

	po::variables_map options;
	try {
		po::store(po::parse_command_line(argc, argv, options_desc), options);
		po::notify(options);
	} catch(exception& e) {
		cerr << "Error: " << e.what() << "\n";
		return 1;
	}
	cerr << "Commandline: " << commandline << endl;

	if (min_counts_per_bin<1) {
		cerr << "Error: option -c: integer >0 expected, but got " << min_counts_per_bin << "." << endl;
		return 1;
	}

	if (readgroupwise) {
		if (output_filename.size() == 0) {
			cerr << "Error: Option -R must be used together with option -o." << endl;
			return 1;
		}
		if (output_filename.find("{readgroup}") == string::npos) {
			cerr << "Error: If option -R is used, argument to option -o must contain the string '{readgroup}'" << endl;
			return 1;
		}
		if (mean_sd_filename.size() > 0) {
			if (mean_sd_filename.find("{readgroup}") == string::npos) {
				cerr << "Error: If option -R is used, argument to option -m must contain the string '{readgroup}'" << endl;
				return 1;
			}
		}
	}

	if ((rglist_filename.size()>0) && (!readgroupwise)) {
		cerr << "Error: Option -L can only be used together with opion -R." << endl;
		return 1;
	}

	ReadGroups* readgroups = 0;
	BamReader* bam_reader = 0;
	// number of alignments used in histogram
	long long total_count = 0;
	// actual histogram data
	Histogram* histogram = 0;
	vector<Histogram*> rgwise_histograms;
	try {
		if (sorted_bam) {
			bam_reader = new SortedBamReader("/dev/stdin", true, 50000, false);
		} else {
			bam_reader = new GroupWiseBamReader("/dev/stdin", true);
		}
		bam_reader->enableProgressMessages(cerr, 1000000);
		if (readgroupwise) {
			readgroups = new ReadGroups(bam_reader->getHeader().ReadGroups, false);
			for (size_t i=0; i<readgroups->size(); ++i) {
				rgwise_histograms.push_back(new Histogram());
			}
		} else {
			histogram = new Histogram();
		}
		while ( bam_reader->hasNext() ) {
			bam_reader->advance();
			if (bam_reader->isFirstUnmapped() || bam_reader->isSecondUnmapped()) continue;
			const vector<BamTools::BamAlignment*>& alignments1 = bam_reader->getAlignmentsFirst();
			const vector<BamTools::BamAlignment*>& alignments2 = bam_reader->getAlignmentsSecond();
			if ((alignments1.size() != 1) || (alignments2.size() != 1)) continue;
			if (alignments1[0]->IsReverseStrand() == alignments2[0]->IsReverseStrand()) continue;
			if (alignments1[0]->RefID != alignments2[0]->RefID) continue;
			if (is_clipped(*(alignments1[0])) || is_clipped(*(alignments2[0]))) continue;
			if (!is_unique(*(alignments1[0])) || !is_unique(*(alignments2[0]))) continue;
			total_count += 1;
			if (readgroupwise) {
				size_t rg_index1 = readgroups->getIndex(*(alignments1[0]));
				size_t rg_index2 = readgroups->getIndex(*(alignments2[0]));
				if (rg_index1 != rg_index2) {
					cerr << "Error: read group different for two reads in a pair. Offending read: " << alignments1[0]->Name << endl;
					return 1;
				}
				assert(rg_index1 < rgwise_histograms.size());
				process_read(alignments1, alignments2, rgwise_histograms[rg_index1]);
			} else {
				process_read(alignments1, alignments2, histogram);
			}
			if (!all && (total_count >= count)) {
				break;
			}
		}
	} catch(exception& e) {
		cerr << "Error: " << e.what() << "\n";
		return 1;
	}
	delete bam_reader;

	if (readgroupwise) {
		ofstream* rglist_out = 0;
		if (rglist_filename.size()>0) {
			rglist_out = new ofstream(rglist_filename.c_str());
		}
		for (size_t rg_index = 0; rg_index < readgroups->size(); ++rg_index) {
			string rg_name = readgroups->getName(rg_index);
			string f = open_wildcard(output_filename, "{readgroup}", rg_name);
			if (rglist_out != 0) {
				(*rglist_out) << rg_name << ' ' << f << endl;
			}
			ofstream* output = new ofstream(f.c_str());
			ofstream* mean_and_sd_out = 0;
			if (mean_sd_filename.compare("") != 0) {
				mean_and_sd_out = new ofstream(open_wildcard(mean_sd_filename, "{readgroup}", rg_name).c_str());
			}
			cerr << "------------ Read group " << rg_name << "------------" << endl;
			output_distribution(rgwise_histograms[rg_index], cerr, *output, mean_and_sd_out, min_counts_per_bin, gaussian_values);
			if (output != 0) delete output;
			if (mean_and_sd_out != 0) delete mean_and_sd_out;
		}
		if (rglist_out != 0) delete rglist_out;
	} else {
		ofstream* output = 0;
		if (output_filename.size() > 0) {
			output = new ofstream(output_filename.c_str());
		}
		ofstream* mean_and_sd_out = 0;
		if (mean_sd_filename.compare("") != 0) {
			mean_and_sd_out = new ofstream(mean_sd_filename.c_str());
		}
		if (!output_distribution(histogram, cerr, (output==0?cout:*output), mean_and_sd_out, min_counts_per_bin, gaussian_values)) {
			return 1;
		}
		if (output != 0) delete output;
		if (mean_and_sd_out != 0) delete mean_and_sd_out;
	}

	return 0;
}
